{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "14e9ee3f",
   "metadata": {},
   "source": [
    "### INTERVAL covariates \n",
    "\n",
    "* Imputed cell count data \n",
    "* Some covariates have NaN: OD_260_230, Agilent_28S_18S, Agilent_Conc_ng_ul, Agilent_Yield_ng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a77ddce9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path \n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e08602d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "covariates_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/phenotypes/rna_seq/processed_v97/covariates/master/master_covariates_v97_swapd_depth_fastq_rin_cell_sex_pcs_season_batch_fc_pipelines_updtd.tsv\"\n",
    "xcell_enrich_path =wkdir_path.joinpath(\"2_misexp_qc/xcell/xCell_estimates.tsv\")\n",
    "peer_covariates_path = wkdir_path.joinpath(\"2_misexp_qc/peer/input/peer_covariates.csv\") \n",
    "peer_out_dir = wkdir_path.joinpath(\"2_misexp_qc/peer/output_35factors_1000iter\") \n",
    "out_dir = wkdir_path.joinpath(\"2_misexp_qc/covariates\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "558c748f",
   "metadata": {},
   "outputs": [],
   "source": [
    "### correlation with covariates \n",
    "\n",
    "# read in covariates \n",
    "covariates_df = pd.read_csv(covariates_path, sep=\"\\t\")\n",
    "\n",
    "bio_covariates_list = ['age_RNA', 'height', 'weight', 'BMI', \"sex_0_1\"]\n",
    "cell_types_covariates_list = ['BA_D_10_9_L___RNA_imptd', 'BA_D_PCT___RNA_imptd', 'BA_N_10_9_L___RNA_imptd', \n",
    "            'BA_N_PCT___RNA_imptd', 'BASO_10_9_L___RNA_imptd', 'BASO_PCT___RNA_imptd', \n",
    "            'Delta_He_pg___RNA_imptd', 'EO_10_9_L___RNA_imptd', 'EO_PCT___RNA_imptd', \n",
    "            'FRC_10_12_L___RNA_imptd', 'FRC_PCT___RNA_imptd', 'H_IPF___RNA_imptd', \n",
    "            'HCT_PCT___RNA_imptd', 'HFLC_10_9_L___RNA_imptd', 'HFLC_PCT___RNA_imptd', \n",
    "            'HFR_PCT___RNA_imptd', 'HGB_g_dL___RNA_imptd', 'HYPER_He_PCT___RNA_imptd', \n",
    "            'HYPO_He_PCT___RNA_imptd', 'IG_10_9_L___RNA_imptd', 'IG_PCT___RNA_imptd', \n",
    "            'IPF___RNA_imptd', 'IPFx_10_9_L___RNA_imptd', 'IRF_PCT___RNA_imptd', \n",
    "            'IRF_Y_ch___RNA_imptd', 'LFR_PCT___RNA_imptd', 'LY_WX___RNA_imptd', \n",
    "            'LY_WY___RNA_imptd', 'LY_WZ___RNA_imptd', 'LY_X_ch___RNA_imptd', \n",
    "            'LY_Y_ch___RNA_imptd', 'LY_Z_ch___RNA_imptd', 'LYMP_10_9_L___RNA_imptd', \n",
    "            'LYMP_PCT___RNA_imptd', 'LYMPH_10_9_L___RNA_imptd', 'LYMPH_PCT___RNA_imptd', \n",
    "            'MacroR_PCT___RNA_imptd', 'MCH_pg___RNA_imptd', 'MCHC_g_dL___RNA_imptd', \n",
    "            'MCV_fL___RNA_imptd', 'MFR_PCT___RNA_imptd', 'MicroR_PCT___RNA_imptd', \n",
    "            'MO_WX___RNA_imptd', 'MO_WY___RNA_imptd', 'MO_WZ___RNA_imptd', 'MO_X_ch___RNA_imptd', \n",
    "            'MO_Y_ch___RNA_imptd', 'MO_Z_ch___RNA_imptd', 'MONO_10_9_L___RNA_imptd', \n",
    "            'MONO_PCT___RNA_imptd', 'MPV_fL___RNA_imptd', 'NE_FSC_ch___RNA_imptd', \n",
    "            'NE_SFL_ch___RNA_imptd', 'NE_SSC_ch___RNA_imptd', 'NE_WX___RNA_imptd', \n",
    "            'NE_WY___RNA_imptd', 'NE_WZ___RNA_imptd', 'NEUT_10_9_L___RNA_imptd', \n",
    "            'NEUT_PCT___RNA_imptd', 'NEUTx_10_9_L___RNA_imptd', 'NEUTx_PCT___RNA_imptd', \n",
    "            'NRBC_10_9_L___RNA_imptd', 'NRBC_PCT___RNA_imptd', 'P_LCR_PCT___RNA_imptd', \n",
    "            'PCT_PCT___RNA_imptd', 'PDW_fL___RNA_imptd', 'PLT_10_9_L___RNA_imptd', \n",
    "            'PLT_F_10_9_L___RNA_imptd', 'PLT_I_10_9_L___RNA_imptd', 'PLT_O_10_9_L___RNA_imptd', \n",
    "            'RBC_10_12_L___RNA_imptd', 'RBC_He_pg___RNA_imptd', 'RBC_O_10_12_L___RNA_imptd', \n",
    "            'RDW_CV_PCT___RNA_imptd', 'RDW_SD_fL___RNA_imptd', 'RET_10_6_uL___RNA_imptd', \n",
    "            'RET_He_pg___RNA_imptd', 'RET_PCT___RNA_imptd', 'RET_RBC_Y_ch___RNA_imptd', \n",
    "            'RET_TNC___RNA_imptd', 'RET_UPP___RNA_imptd', 'RET_Y_ch___RNA_imptd', 'RPI___RNA_imptd', \n",
    "            'TNC_10_9_L___RNA_imptd', 'TNC_D_10_9_L___RNA_imptd', 'TNC_N_10_9_L___RNA_imptd', \n",
    "            'WBC_10_9_L___RNA_imptd', 'WBC_D_10_9_L___RNA_imptd', 'WBC_N_10_9_L___RNA_imptd']\n",
    "            \n",
    "tech_covariates_list = ['Conc_ng_ul', 'OD_260_280','OD_260_230','Yield_ng','Agilent_28S_18S',\n",
    "                        'Agilent_Conc_ng_ul', 'Agilent_Yield_ng',\n",
    "                        'Agilent_RINe_imptd_by_batch','Assigned', 'Unassigned_MultiMapping', \n",
    "                        'Unassigned_NoFeatures', 'Unassigned_Ambiguity','gc_percent_forward_read', \n",
    "                        'gc_percent_reverse_read', 'adapters_percent_forward_read', \n",
    "                        'adapters_percent_reverse_read', 'percent_mapped', 'percent_duplicate', \n",
    "                        'rna_exonic_rate', 'rna_rrna_rate', 'rna_globin_percent_tpm', \n",
    "                        'rna_mitochondrial_percent_tpm', 'num_reads', 'RawReadDepth',\n",
    "                        'RawReadDepth_fromFastQFile']\n",
    "\n",
    "other_covariates = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11','PC12',\n",
    "                    'PC13','PC14','PC15','PC16','PC17','PC18','PC19','PC20','Season_Winter',\n",
    "                    'Season_Autumn','Season_Spring', 'Season_Summer','sequencingBatch_10',\n",
    "                    'sequencingBatch_4','sequencingBatch_15','sequencingBatch_3','sequencingBatch_5',\n",
    "                    'sequencingBatch_8','sequencingBatch_2', 'sequencingBatch_6','sequencingBatch_14',\n",
    "                    'sequencingBatch_11','sequencingBatch_1','sequencingBatch_12','sequencingBatch_9',\n",
    "                    'sequencingBatch_7','sequencingBatch_13',]\n",
    "\n",
    "covariate_list = bio_covariates_list + cell_types_covariates_list + tech_covariates_list + other_covariates\n",
    "covariates_susbet_df = covariates_df[[\"rna_id\"] + covariate_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7eabeb79",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of biological covariates: 5\n",
      "Number of Sysmex cell-types: 89\n",
      "Number of technical covariates: 25\n",
      "Number of other covariates: 39\n",
      "Total number of measured covariates: 158\n"
     ]
    }
   ],
   "source": [
    "print(f\"Number of biological covariates: {len(bio_covariates_list)}\")\n",
    "print(f\"Number of Sysmex cell-types: {len(cell_types_covariates_list)}\")\n",
    "print(f\"Number of technical covariates: {len(tech_covariates_list)}\")\n",
    "print(f\"Number of other covariates: {len(other_covariates)}\")\n",
    "print(f\"Total number of measured covariates: {len(covariate_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "77a9b9a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of xCell features: 67\n"
     ]
    }
   ],
   "source": [
    "# correlation with inferred cell enrichments (xCell)\n",
    "xcell_enrich_df = pd.read_csv(xcell_enrich_path, sep=\"\\t\")\n",
    "xcell_features = xcell_enrich_df.columns.tolist()\n",
    "print(f\"Number of xCell features: {len(xcell_features)}\")\n",
    "xcell_enrich_reidx_df = xcell_enrich_df.reset_index().rename(columns={\"index\":\"rna_id\"})\n",
    "\n",
    "covariates_xcell_df = pd.merge(covariates_susbet_df, \n",
    "                                 xcell_enrich_reidx_df, \n",
    "                                 on=\"rna_id\", \n",
    "                                 how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c84911a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of PEER factors: 35\n"
     ]
    }
   ],
   "source": [
    "### add peer factors \n",
    "# load PEER factors on 4,568 samples passing QC \n",
    "peer_factors_path = peer_out_dir.joinpath(\"X.csv\")\n",
    "peer_factor_df = pd.read_csv(peer_factors_path)\n",
    "peer_covariates_df = pd.read_csv(peer_covariates_path).rename(columns={\"Unnamed: 0\": \"rna_id\"})\n",
    "# add RNA ID to PEER factor columns \n",
    "peer_factor_df.columns = peer_covariates_df.rna_id.tolist()\n",
    "# add row names \n",
    "covariates = [col for col in peer_covariates_df.columns.tolist() if col != \"rna_id\"]\n",
    "peer_factors = [f\"PEER{n}\" for n in range(1,36)]\n",
    "print(f\"Total number of PEER factors: {len(peer_factors)}\")\n",
    "peer_factor_index = covariates + [\"intercept\"] + peer_factors\n",
    "peer_factor_df.index = peer_factor_index\n",
    "peer_factor_clean_df = peer_factor_df.transpose()[peer_factors]\n",
    "peer_factor_clean_df = peer_factor_clean_df.reset_index().rename(columns={\"index\": \"rna_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a5abbd84",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of covariates: 260\n"
     ]
    }
   ],
   "source": [
    "print(f\"Total number of covariates: {len(xcell_features) + len(covariate_list) + len(peer_factors)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b2390943",
   "metadata": {},
   "outputs": [],
   "source": [
    "covariates_xcell_peer_df = pd.merge(covariates_xcell_df, \n",
    "                                       peer_factor_clean_df, \n",
    "                                       on=\"rna_id\", \n",
    "                                       how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "37e023e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "covariates_xcell_path = out_dir.joinpath(\"covariates_xcell_peer.tsv\")\n",
    "covariates_xcell_peer_df.to_csv(covariates_xcell_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b81b8bf0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
